# ===============================================
# RX 480 Full VRAM + Compute Saturation Benchmark (Fixed Kernel Call)
# ===============================================
import pyopencl as cl
import numpy as np
import time

# ---------------------------
# Auto-detect GPU
# ---------------------------
# Pick the first GPU-type device, scanning platforms in the order OpenCL reports them.
device = None
for platform in cl.get_platforms():
    device = next(
        (d for d in platform.get_devices() if d.type & cl.device_type.GPU),
        None,
    )
    if device is not None:
        break

if device is None:
    raise RuntimeError("No GPU found via OpenCL")

# One context and one command queue on the chosen device are shared by the rest of the script.
ctx = cl.Context([device])
queue = cl.CommandQueue(ctx)

# Report the device properties the benchmark depends on.
print("Using device:", device.name)
vram_bytes = device.global_mem_size
print(f"VRAM: {vram_bytes/1024**3:.2f} GB")
print("Compute Units:", device.max_compute_units)
print("Max Clock (MHz):", device.max_clock_frequency)

# ---------------------------
# Prismatic Recursion Model
# ---------------------------
def expansion(depth):
    """Return 8 raised to the power ``depth``."""
    return pow(8, depth)

# Bytes budgeted per seed; the seed count is how many such budgets fit into the
# device's reported global memory size.
# NOTE(review): the benchmark uploads one float32 per seed, not 64 bytes —
# confirm that 64 is the intended per-seed footprint.
seed_size = 64
max_total_seeds = vram_bytes // seed_size
print(f"Max seeds based on VRAM: {max_total_seeds:,}")

# ---------------------------
# OpenCL kernel
# ---------------------------
# OpenCL C source. Each work item loads one float from `data`, applies
# x = sqrt(x * 1.618f + 0.5f) * 1.0001f `expansion` times in a row, and stores
# the result back to the same slot.
kernel_code = """
__kernel void recurse(__global float *data, const int expansion)
{
    int gid = get_global_id(0);
    float x = data[gid];
    for(int i=0; i<expansion; i++){
        x = sqrt(x * 1.618f + 0.5f) * 1.0001f;
    }
    data[gid] = x;
}
"""
# Compile for the context's device, then fetch the kernel handle once so every
# launch reuses the same object.
program = cl.Program(ctx, kernel_code)
program.build()
kernel = program.recurse

# ---------------------------
# Chunked ceiling finder
# ---------------------------
# Raise the recursion depth until one timed pass over every seed takes longer
# than 1 / target_fps seconds of kernel time.
target_fps = 1.0
chunk_size = 2**24  # 16M seeds per chunk
depth = 1
total_seeds = max_total_seeds
num_chunks = (total_seeds + chunk_size - 1) // chunk_size

# The kernel receives its iteration count as a 32-bit signed int, so larger
# expansion factors cannot be passed without overflowing.
max_kernel_expansion = np.iinfo(np.int32).max

# The seed values are never read back, so the host array is loop-invariant:
# generate it once and upload a prefix of it for each chunk.
host_seeds = np.random.rand(min(chunk_size, total_seeds)).astype(np.float32)

while True:
    expansion_factor = expansion(depth)
    if expansion_factor > max_kernel_expansion:
        print("\n[Stopped] Expansion factor no longer fits the kernel's 32-bit int argument")
        break
    print(f"\n[Testing] Depth={depth}, Expansion={expansion_factor:,}, Chunks={num_chunks}")

    kernel_seconds = 0.0   # sum of the timed launches only (warmups excluded)
    iterations_done = 0    # kernel loop iterations executed by the timed launches

    for chunk_idx in range(num_chunks):
        # The final chunk may be shorter than chunk_size.
        current_chunk = min(chunk_size, total_seeds - chunk_idx * chunk_size)
        buf = cl.Buffer(
            ctx,
            cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
            hostbuf=host_seeds[:current_chunk],
        )
        try:
            # Set kernel args
            kernel.set_args(buf, np.int32(expansion_factor))

            # Warmup
            cl.enqueue_nd_range_kernel(queue, kernel, (current_chunk,), None).wait()

            # Timed run; perf_counter is monotonic and high-resolution, so a
            # short launch does not measure as zero elapsed time.
            t0 = time.perf_counter()
            cl.enqueue_nd_range_kernel(queue, kernel, (current_chunk,), None).wait()
            kernel_seconds += time.perf_counter() - t0
        finally:
            # Free the device allocation before the next chunk is created.
            buf.release()

        iterations_done += current_chunk * expansion_factor

    # Aggregate rate = total work / total timed kernel time. Summing per-chunk
    # rates would scale the figure with the number of chunks.
    # Each kernel loop iteration is counted as one operation.
    if kernel_seconds > 0:
        fps = 1.0 / kernel_seconds
        flops_total = iterations_done / kernel_seconds / 1e9
    else:
        fps = float("inf")
        flops_total = 0.0
    vram_used_mb = total_seeds * seed_size / 1024**2

    print(f"Depth {depth} | Total Seeds={total_seeds:,} | VRAM={vram_used_mb:.1f} MB | "
          f"{fps:.2f} FPS | {flops_total:.2f} GFLOPs")

    if fps < target_fps:
        print("\n[Ceiling reached]")
        break

    depth += 1
